From d0dc1f52cc41a6c5a881123056e97cf4dfbebea4 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Thu, 19 Mar 2015 16:29:32 +0100 Subject: [PATCH] gtkcairoblur: Unroll inner loop for common radius values This unrolls the inner blur loop for radius 2-10 (radius 1 is a no-op and needs no special case), allowing the compiler to use a divide-by-constant operation instead of a generic division. Here is the blur-performance output before: Radius 1: 124.95 msec, 32.01 kpixels/msec: Radius 2: 117.27 msec, 34.11 kpixels/msec: Radius 3: 123.57 msec, 32.37 kpixels/msec: Radius 4: 118.17 msec, 33.85 kpixels/msec: Radius 5: 119.32 msec, 33.52 kpixels/msec: Radius 6: 124.17 msec, 32.21 kpixels/msec: Radius 7: 121.04 msec, 33.05 kpixels/msec: Radius 8: 130.64 msec, 30.62 kpixels/msec: Radius 9: 119.47 msec, 33.48 kpixels/msec: Radius 10: 117.95 msec, 33.91 kpixels/msec: Radius 11: 122.38 msec, 32.68 kpixels/msec: Radius 12: 121.92 msec, 32.81 kpixels/msec: Radius 13: 125.45 msec, 31.89 kpixels/msec: Radius 14: 121.63 msec, 32.89 kpixels/msec: Radius 15: 120.18 msec, 33.28 kpixels/msec: And after: Radius 1: 42.26 msec, 94.65 kpixels/msec: Radius 2: 59.15 msec, 67.62 kpixels/msec: Radius 3: 60.29 msec, 66.35 kpixels/msec: Radius 4: 64.53 msec, 61.99 kpixels/msec: Radius 5: 60.07 msec, 66.59 kpixels/msec: Radius 6: 62.43 msec, 64.07 kpixels/msec: Radius 7: 60.36 msec, 66.27 kpixels/msec: Radius 8: 59.59 msec, 67.13 kpixels/msec: Radius 9: 76.17 msec, 52.51 kpixels/msec: Radius 10: 79.41 msec, 50.37 kpixels/msec: Radius 11: 118.92 msec, 33.64 kpixels/msec: Radius 12: 121.31 msec, 32.97 kpixels/msec: Radius 13: 118.30 msec, 33.81 kpixels/msec: Radius 14: 116.82 msec, 34.24 kpixels/msec: Radius 15: 116.99 msec, 34.19 kpixels/msec: I.e. almost double performance for the unrolled radius values. 
https://bugzilla.gnome.org/show_bug.cgi?id=746468 --- gtk/gtkcairoblur.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/gtk/gtkcairoblur.c b/gtk/gtkcairoblur.c index 82f5d33f48..170f460bf2 100644 --- a/gtk/gtkcairoblur.c +++ b/gtk/gtkcairoblur.c @@ -73,18 +73,37 @@ blur_xspan (guchar *row, * only divide down after all three passes. (SSE parallel implementation * of the divide step is possible.) */ - for (i = -d + offset; i < row_width + offset; i++) - { - if (i >= 0 && i < row_width) - sum += row[i]; - - if (i >= offset) - { - if (i >= d) - sum -= row[i - d]; - tmp_buffer[i - offset] = (sum + d / 2) / d; - } +#define BLUR_ROW_KERNEL(D) \ + for (i = -(D) + offset; i < row_width + offset; i++) \ + { \ + if (i >= 0 && i < row_width) \ + sum += row[i]; \ + \ + if (i >= offset) \ + { \ + if (i >= (D)) \ + sum -= row[i - (D)]; \ + \ + tmp_buffer[i - offset] = (sum + (D) / 2) / (D); \ + } \ + } \ + break; + + /* We unroll the values for d for radius 2-10 to avoid a generic + * divide operation (not radius 1, because it's a no-op) */ + switch (d) + { + case get_box_filter_size (2): BLUR_ROW_KERNEL (get_box_filter_size (2)); + case get_box_filter_size (3): BLUR_ROW_KERNEL (get_box_filter_size (3)); + case get_box_filter_size (4): BLUR_ROW_KERNEL (get_box_filter_size (4)); + case get_box_filter_size (5): BLUR_ROW_KERNEL (get_box_filter_size (5)); + case get_box_filter_size (6): BLUR_ROW_KERNEL (get_box_filter_size (6)); + case get_box_filter_size (7): BLUR_ROW_KERNEL (get_box_filter_size (7)); + case get_box_filter_size (8): BLUR_ROW_KERNEL (get_box_filter_size (8)); + case get_box_filter_size (9): BLUR_ROW_KERNEL (get_box_filter_size (9)); + case get_box_filter_size (10): BLUR_ROW_KERNEL (get_box_filter_size (10)); + default: BLUR_ROW_KERNEL (d); } memcpy (row, tmp_buffer, row_width); -- 2.30.2